###############################################################################-
# Author: Pietryka
# Contact: matthew.pietryka@gmail.com
# Creation Date:  2025-06-11
# Purpose: Tables 1-3, descriptive stats for our samples
# Notes:   Code for Table 1 provided, but we cannot share the data b/c it could
#          be used to identify individual participants
###############################################################################-



#  1. Load Packages =====================

library(dplyr)
library(tidyr)
library(readr)
library(purrr)
library(stringr)
library(modelsummary)
library(janitor)
library(stringr)
library(gt)


# Flag whether the focal student and the roommate hold the same value.
# Returns an integer vector: 1L where `focal` equals `roommate`, 0L where they
# differ, and 0L where the comparison is NA (e.g., either value is missing).
same_vals <- function(focal, roommate){
  is_match <- focal == roommate
  # an NA comparison counts as "not the same"
  as.integer(!is.na(is_match) & is_match)
}

# Absolute gap between the focal student's and the roommate's values on a
# given variable. Computed element-wise; an NA in either input yields NA.
absdiff_vals <- function(focal, roommate){
  gap <- focal - roommate
  abs(gap)
}

# 2. Load Data ====================================

# Individual-, room- and dyad-level inputs stored as .rds files
indiv18_df <- read_rds("data-files/indiv18_df.rds")
room_affil_random_df <- read_rds("data-files/room_affil_random_df.rds")
observed_dyads_df <- read_rds("data-files/observed_dyads_df.rds")
id18_random_df <- read_rds("data-files/id18_random_df.rds")
id18_selected_df <- read_rds("data-files/id18_selected_df.rds")
id18_term_random_df <- read_rds("data-files/id18_term_random_df.rds")
selected_dyads_df <- read_rds("data-files/selected_dyads_df.rds")

# Summary figures used for Table 3: keep only the columns from `Group` through
# `prop_hispanic` and read at most the first three data rows.
# NOTE(review): this is the only input read from "data/derived/" rather than
# "data-files/" -- confirm the path matches the replication archive.
tufts_df <- read_csv(
  file = "data/derived/tufts-summary-v2.csv",
  col_select = Group:prop_hispanic,
  n_max = 3
)

# Stack the randomly assigned and the self-selected dyads. Observed dyads are
# tagged "Random" before stacking; any row whose `sample_type` is still NA
# afterwards is labelled "Self-selected".
dyads_df <- bind_rows(
  mutate(observed_dyads_df, sample_type = "Random"),
  selected_dyads_df
) |>
  mutate(sample_type = replace_na(sample_type, "Self-selected"))

# 3. Print Descriptive Stats ============================================

# Each expression below auto-prints a count that is quoted in the text.

# Unique students who gave consent and were over 18 by Election Day
# (reported: 2,287)
nrow(distinct(indiv18_df, anon_id))

# ... of whom this many were randomly assigned (reported: 1,310)
indiv18_df |>
  inner_join(id18_term_random_df) |>
  distinct(anon_id) |>
  nrow()

# ... of whom this many shared a room with at least one other consenting
# student (reported: 423)
nrow(distinct(room_affil_random_df, anon_id))

# Room-size breakdown: keep one row per student, count students per
# room-by-term, then tabulate how many rooms (and students) fall in each size.
# Reported: 396 students with a single roommate, 27 students rooming with two
# other students.
room_affil_random_df |>
  distinct(anon_id, .keep_all = TRUE) |>
  count(anon_address, term_code) |>
  group_by(n) |>
  summarise(
    n_rooms = n(),
    n_individuals = sum(n)
  )

# Number of undirected dyads formed by these individuals (reported: 225)
nrow(observed_dyads_df)



# 4. Format the Tables =========================================


# Theme applied to `gt` tables: striped rows and an "Arial Narrow" font stack
# that falls back to gt's default fonts. `...` is accepted but not used.
custom_theme <- function(x, ...) {
  font_stack <- list(
    google_font("Arial Narrow"),
    default_fonts()
  )
  striped <- opt_row_striping(x, row_striping = TRUE)
  opt_table_font(striped, font = font_stack)
}
# Register the theme under the `modelsummary_theme_gt` option
options(modelsummary_theme_gt = custom_theme)


## Table 1: Dyadic data ---------------

# The data required for this table cannot be shared (anonymity), so the code is
# wrapped in `if (FALSE)` and kept for reference only.

if (FALSE) {

  # Dyad-level similarity (same_*) and absolute difference (absdiff_*) between
  # the focal student's (_f) and the roommate's (_rm) values
  dyad_summary_df <- mutate(
    dyads_df,
    same_highschool = same_vals(high_school_id_f, high_school_id_rm),
    same_race = same_vals(race_imp_f, race_imp_rm),
    absdiff_age = absdiff_vals(age_f, age_rm),
    absdiff_parents_turnout = absdiff_vals(
      parents_turnout_mean0_f,
      parents_turnout_mean0_rm
    ),
    absdiff_zip_med_age = absdiff_vals(zip_med_age_f, zip_med_age_rm),
    absdiff_zip_minority_prop = absdiff_vals(
      zip_minority_prop_f,
      zip_minority_prop_rm
    ),
    absdiff_zip_degree_prop = absdiff_vals(
      zip_degree_prop_f,
      zip_degree_prop_rm
    ),
    absdiff_zip_med_income = absdiff_vals(zip_med_income_f, zip_med_income_rm)
  )

  # Group means by sample type (NA values dropped).
  # NOTE: inside a braced block only the final expression is auto-printed, so
  # wrap this in print() to see it if the block is ever enabled.
  dyad_summary_df |>
    group_by(sample_type) |>
    summarise(
      across(
        c(starts_with("same_"), starts_with("absdiff")),
        \(x) mean(x, na.rm = TRUE)
      )
    )

  # Keep the grouping column plus the measures, relabelled for display
  dyad_balance_df <- select(
    dyad_summary_df,
    sample_type,
    `Attended same highschool (0 = different; 1 = same)` = same_highschool,
    `Same race/ethnicity (0 = different; 1 = same)` = same_race,
    `Difference in age (in years)` = absdiff_age,
    `Difference in parents' turnout, 2008-2014` = absdiff_parents_turnout,
    `Difference in zip code median age` = absdiff_zip_med_age,
    `Difference in zip code minority population` = absdiff_zip_minority_prop,
    `Difference in zip code education` = absdiff_zip_degree_prop,
    `Difference in zip code median income` = absdiff_zip_med_income
  )

  # Balance table split by sample type, rendered as a gt table
  summary_dyads <- datasummary_balance(
    ~ sample_type,
    data = dyad_balance_df,
    fmt = fmt_significant(2),
    align = "ldddddd",
    output = "gt"
  )

  # display
  summary_dyads

}



## Table 2: Individual-level stats ---------------


# Go from one row per dyad to one row per student: stack the two id columns,
# keep the first row seen for each student, and mark these students as
# belonging to a "Complete" dyad.
indiv_complete_df <- dyads_df |>
  select(anon_id1, anon_id2, sample_type) |>
  pivot_longer(
    cols = c(anon_id1, anon_id2),
    names_to = "order",
    values_to = "anon_id"
  ) |>
  distinct(anon_id, .keep_all = TRUE) |>
  mutate(dyad_type = "Complete")

# Stack the random and self-selected student lists (one row per student), then
# attach the individual-level covariates and the dyad membership built above.
indiv_summary_df <- bind_rows(
  mutate(id18_random_df, sample_choice = "Random"),
  id18_selected_df
) |>
  mutate(sample_choice = replace_na(sample_choice, "Self-selected")) |>
  distinct(anon_id, .keep_all = TRUE) |>
  left_join(indiv18_df) |>
  left_join(indiv_complete_df) |>
  mutate(
    # number of the three elections (2016, 2018, 2020) with recorded turnout
    n_turnout = turnout_2016 + turnout_2018 + turnout_2020,
    # students absent from `indiv_complete_df` have no dyad_type after the join
    dyad_type = replace_na(dyad_type, "Incomplete"),
    # three-way grouping used as the column split in the table
    sample_combined = case_when(
      sample_choice == "Random" &
        dyad_type == "Complete" &
        sample_type == "Random" ~ "Random, Complete",
      sample_choice == "Random" ~ "Random, Partial",
      sample_choice == "Self-selected" ~ "Self-selected"
    )
  )


# Cast the race/ethnicity indicators to integer and give every reported
# variable its display label
indiv_balance_df <- indiv_summary_df |>
  mutate(across(c(black, hispanic, white), as.integer)) |>
  select(
    sample_combined,
    `Turnout, 2016` = turnout_2016,
    `Turnout, 2018` = turnout_2018,
    `Turnout, 2020` = turnout_2020,
    `Turnout Count, 2016-2020` = n_turnout,
    `Gender = Woman` = female,
    `Race/Ethnicity = Black` = black,
    `Race/Ethnicity = White, non-Hispanic` = white,
    `Race/Ethnicity = Hispanic` = hispanic
  )
# Column labels collapsed into one "|^"-separated string
# (not referenced again in this file)
row_names <- paste(names(indiv_balance_df), collapse = "|^")

# Balance table split by the combined sample grouping, without the
# difference-in-means columns, rendered as a gt table
summary_indiv <- datasummary_balance(
  ~ sample_combined,
  data = indiv_balance_df,
  fmt = fmt_decimal(digits = 2, pdigits = 2),
  align = "ldddddd",
  dinm = FALSE,
  output = "gt"
)

# display
summary_indiv





## Table 3: compare FSU students to college students nationally ---------------

# Build display-ready column names. After these steps each name is
# "<Spanner> <Label>" separated by a single space, e.g.
#   prop_female -> gender_woman -> "Gender Woman"
#   prop_black  -> "Race/Ethnicity_black" -> "Race/Ethnicity Black"
# Only the first "_" in each name is replaced by a space.
tufts_fmt_df <- tufts_df |>
  mutate(Group = str_replace(Group, "FSU", "Florida State University")) |>
  rename(gender_woman = prop_female) |>
  rename_with(~ str_replace(.x, "prop", "Race/Ethnicity")) |>
  rename_with(~ str_replace(.x, "_", " ")) |>
  rename_with(str_to_title)


# Render the data frame as a gt table, split the column names on the space to
# create spanner headers, and show missing cells as "---".
# (`dinm` is a datasummary_balance() argument, not a datasummary_df() one, so
# it is not passed here.)
summary_tufts <-
  datasummary_df(
    data = tufts_fmt_df,
    fmt = 2,
    output = "gt"
  ) |>
  tab_spanner_delim(delim = " ") |>
  sub_missing(missing_text = "---")

# display
summary_tufts





# 5. Save ========================

# Create the output folder if needed so the writes below do not depend on it
# already existing (no effect when the folder is already there)
dir.create("Results", showWarnings = FALSE, recursive = TRUE)

# Table 1 (disabled: the data needed to build `summary_dyads` cannot be shared)
# gtsave(summary_dyads, filename = "Results/summary_dyads.docx")

# Table 2: individual-level stats
gtsave(summary_indiv, filename = "Results/summary_indiv.docx")

# Table 3: FSU vs. national comparison
gtsave(summary_tufts, filename = "Results/summary_tufts.docx")
